#load libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.1.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.1.3
library(scales)
## Warning: package 'scales' was built under R version 4.1.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lattice)
## Warning: package 'lattice' was built under R version 4.1.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
# Keep a working copy of the built-in mtcars data under the name `cars`
# (note: this name shadows the `cars` data set shipped with base R).
cars <- mtcars

# Histogram of the cylinder count, one bin per unit.
cars %>%
  ggplot(mapping = aes(x = cyl)) +
  geom_histogram(fill = "red", binwidth = 1)
# install this package:
#
#install.packages("wooldridge")
library(wooldridge)
## Warning: package 'wooldridge' was built under R version 4.1.3
# Open the documentation page for the `attend` data set.
help("attend")
## starting httpd help server ... done

# Work on a copy of the data called `df`, then draw a lattice scatter plot
# with `attend` on the y axis and `termGPA` on the x axis.
df <- attend
xyplot(attend ~ termGPA, data = df)
## This plot shows the correlation between term GPA and student attendance.
# Scatter plot of missed classes (y) against prior GPA (x), drawn in one
# panel per value of `soph` (0 = not sophomore, 1 = sophomore, as stated in
# the caption). `label_both` prints the variable name and value in each strip.
ggplot(data = df, aes(x = priGPA, y = missed)) +
  geom_point(size = 2, color = "blue") +
  facet_grid(cols = vars(soph), labeller = label_both) +
  theme_bw() +
  labs(
    title = "Not Sophomore vs. Sophomore",
    # Axis labels must follow the aesthetic mapping: priGPA is on x and
    # missed is on y (the two labels were previously swapped).
    x = "Prior GPA",
    y = "Missed Classes",
    caption = "The third variable soph dictates 0 as 'not sophomore' and 1 as 'sophomore'"
  )
## Question 4: Generate a bar plot for the count of ACT scores from “df”. What type of distribution does this plot remind you of?
# Bar plot of the number of students at each ACT score.
# geom_bar() counts the rows for every distinct x value and has no
# `binwidth` parameter (passing one only triggers an "unknown parameters"
# warning), so no bin width is supplied.
ggplot(df, aes(x = ACT)) +
  geom_bar()
## This plot reminds me of a symmetrical distribution.
# Bars of `final` by `ACT`, filled by sophomore status, with the two
# `soph` groups placed side by side.
df %>%
  ggplot(aes(x = ACT, y = final, fill = factor(soph))) +
  geom_col(position = "dodge")

# Same data with the default stacked position, for comparison.
df %>%
  ggplot(aes(x = ACT, y = final, fill = factor(soph))) +
  geom_col()
# The dodge position helps me notice the differences in relationships more. With dodged bars it is easier to read a frequency difference, but it may be better to view the plot in a stacked position to compare distributions. I would say that for both non-sophomores and sophomores, the data seem to be relatively symmetrical. However, it seems that those who got a better grade on the final also did better on the ACT.
# Box plots of attendance for every ACT score, split by the `frosh`
# indicator.
# The split is expressed through `fill` rather than `group`: an explicit
# `group` aesthetic replaces ggplot2's default grouping by the discrete x
# value, which would collapse the plot into a single box per `frosh` value
# instead of one box per ACT score and group.
ggplot(data = df, aes(x = factor(ACT), y = attend)) +
  geom_boxplot(aes(fill = factor(frosh)))
#Yes, the relationship between x and y is the same for both groups.
# Scatter plot of attendance against ACT score, one panel per `frosh` value.
df %>%
  ggplot(mapping = aes(x = ACT, y = attend)) +
  facet_wrap(~ frosh) +
  geom_point()
# The group differences become clearer. It appears that attendance does not seem to have a major effect on ACT scores.
# Pairwise plot matrix (scatter plots, densities and correlations) for the
# four selected columns.
df %>%
  select(attend, termGPA, priGPA, ACT) %>%
  ggpairs()
# The pair of variables with the highest correlation is priGPA and termGPA. The correlation statistic is 0.653. This makes sense, as students tend to be consistent with their GPA from previous semesters. The next pair of variables with a high correlation is termGPA and attendance. According to the data, the more effective way to increase GPA is to have a higher homework turn-in rate. This is because previous GPA has the highest correlation with the term GPA. Turning in homework increases GPA, and has a stronger correlation than having higher attendance.
# Scatter plot of term GPA against prior GPA, coloured by sophomore status,
# with an overall linear fit plus a separate linear fit for each `soph`
# group. The result is stored in `plot` so later statements can reuse it.
plot <- ggplot(data = df, aes(x = priGPA, y = termGPA)) +
  geom_point(aes(color = factor(soph))) +
  # Overall regression of termGPA on priGPA across all students.
  geom_smooth(method = "lm") +
  # Per-group regressions: mapping colour to factor(soph) splits the fit by
  # group while keeping priGPA on the x axis. (Mapping x to `soph` instead
  # would regress termGPA on the 0/1 indicator and draw it on the GPA axis.)
  geom_smooth(aes(color = factor(soph)), method = "lm")
# The relationship between priGPA and termGPA seems to be stronger for the sophomore group: its points lie closer to the regression line.
# Convert the stored ggplot object into an interactive plotly widget.
ggplotly(p = plot)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Add non-overlapping text labels showing each point's `soph` value.
plot +
  geom_label_repel(mapping = aes(label = soph))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: ggrepel: 637 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Write the stored plot to myplot.png in the working directory, using the
# default size of the current graphics device.
ggsave(filename = "myplot.png", plot = plot)
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'